#!/usr/bin/env python
# -*- coding:utf8 -*-

"""
@version: 
@author: lh
@license: Apache Licence 
@contact: liuhuan0672@gmail.com
@site: 
@software: PyCharm
@file: 0008.py
@time: 2016/1/20 15:28

第 0008 题：一个HTML文件，找出里面的正文。
(Problem 0008: given an HTML file, extract its main body text.)
"""
import re
import urllib.request


def get_html_body(url, encoding='GBK'):
    """Fetch a page and return the text found inside its ``<p>`` elements.

    Args:
        url: Address handed to ``urllib.request.urlopen``.
        encoding: Codec used to decode the response bytes. Defaults to
            ``'GBK'``, the value that used to be hard-coded here.

    Returns:
        A list of strings, one per ``<p>...</p>`` match, in the order the
        matches occur in the page. At most one tag directly after ``<p>``
        and one tag directly before ``</p>`` are stripped from each match.

    Raises:
        UnicodeDecodeError: If the response bytes are not valid ``encoding``.
        Errors raised by ``urllib.request.urlopen`` propagate unchanged.
    """
    # Use the response as a context manager so the underlying connection is
    # released even if read() raises.
    with urllib.request.urlopen(url) as response:
        html_content = response.read()

    # NOTE: the pattern only matches a bare ``<p>`` (no attributes), and
    # ``.`` does not match newlines, so paragraph text that spans several
    # lines is not captured.
    paragraph_re = re.compile(r'<p>(?:<.[^>]*>)?(.*?)(?:<.[^>]*>)?</p>')
    return paragraph_re.findall(html_content.decode(encoding))


if __name__ == '__main__':
    # Script entry point: download the article, extract its paragraphs and
    # write them, one per line, to a UTF-8 text file.
    import os

    output_path = './result/0008.txt'
    body = get_html_body('http://tech.163.com/14/1219/01/ADPT7MTE000915BF.html')

    # open() does not create missing directories; make sure the target
    # directory exists so the write does not fail after the download.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # `with` guarantees the file is closed even if a write raises.
    with open(output_path, 'w', encoding='utf-8') as file_obj:
        file_obj.writelines(line + '\n' for line in body)
